Install and load R packages

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

if (!require("GEOquery", quietly = TRUE))
    BiocManager::install("GEOquery") 

if (!require("rScudo", quietly = TRUE))
    BiocManager::install("rScudo") 

library(tidyverse)
library(dplyr)
library(GEOquery)
library(rScudo)
library(plotly)
library(sessioninfo)
library(ggplot2)
library(factoextra)

R session information

print('Reproducibility information:') 
## [1] "Reproducibility information:"
Sys.time()
## [1] "2024-09-05 11:48:08 CEST"
proc.time()
##    user  system elapsed 
##   1.195   0.111   1.768
options(width = 120)
session_info()
## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.4.0 (2024-04-24)
##  os       macOS Sonoma 14.6.1
##  system   aarch64, darwin23.4.0
##  ui       unknown
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Europe/Rome
##  date     2024-09-05
##  pandoc   3.1.11 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package      * version date (UTC) lib source
##  Biobase      * 2.64.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.0)
##  BiocGenerics * 0.50.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.0)
##  BiocManager  * 1.30.24 2024-08-20 [1] CRAN (R 4.4.0)
##  bslib          0.8.0   2024-07-29 [1] CRAN (R 4.4.0)
##  cachem         1.1.0   2024-05-16 [1] CRAN (R 4.4.0)
##  cli            3.6.3   2024-06-21 [1] CRAN (R 4.4.0)
##  colorspace     2.1-1   2024-07-26 [1] CRAN (R 4.4.0)
##  data.table     1.15.4  2024-03-30 [1] CRAN (R 4.4.0)
##  digest         0.6.37  2024-08-19 [1] CRAN (R 4.4.0)
##  dplyr        * 1.1.4   2023-11-17 [1] CRAN (R 4.4.0)
##  evaluate       0.24.0  2024-06-10 [1] CRAN (R 4.4.0)
##  factoextra   * 1.0.7   2020-04-01 [1] CRAN (R 4.4.0)
##  fansi          1.0.6   2023-12-08 [1] CRAN (R 4.4.0)
##  fastmap        1.2.0   2024-05-15 [1] CRAN (R 4.4.0)
##  forcats      * 1.0.0   2023-01-29 [1] CRAN (R 4.4.0)
##  generics       0.1.3   2022-07-05 [1] CRAN (R 4.4.0)
##  GEOquery     * 2.72.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.0)
##  ggplot2      * 3.5.1   2024-04-23 [1] CRAN (R 4.4.0)
##  ggrepel        0.9.5   2024-01-10 [1] CRAN (R 4.4.0)
##  glue           1.7.0   2024-01-09 [1] CRAN (R 4.4.0)
##  gtable         0.3.5   2024-04-22 [1] CRAN (R 4.4.0)
##  hms            1.1.3   2023-03-21 [1] CRAN (R 4.4.0)
##  htmltools      0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
##  htmlwidgets    1.6.4   2023-12-06 [1] CRAN (R 4.4.0)
##  httr           1.4.7   2023-08-15 [1] CRAN (R 4.4.0)
##  jquerylib      0.1.4   2021-04-26 [1] CRAN (R 4.4.0)
##  jsonlite       1.8.8   2023-12-04 [1] CRAN (R 4.4.0)
##  knitr          1.48    2024-07-07 [1] CRAN (R 4.4.0)
##  lazyeval       0.2.2   2019-03-15 [1] CRAN (R 4.4.0)
##  lifecycle      1.0.4   2023-11-07 [1] CRAN (R 4.4.0)
##  limma          3.60.4  2024-07-17 [1] Bioconductor 3.19 (R 4.4.0)
##  lubridate    * 1.9.3   2023-09-27 [1] CRAN (R 4.4.0)
##  magrittr       2.0.3   2022-03-30 [1] CRAN (R 4.4.0)
##  munsell        0.5.1   2024-04-01 [1] CRAN (R 4.4.0)
##  pillar         1.9.0   2023-03-22 [1] CRAN (R 4.4.0)
##  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.4.0)
##  plotly       * 4.10.4  2024-01-13 [1] CRAN (R 4.4.0)
##  purrr        * 1.0.2   2023-08-10 [1] CRAN (R 4.4.0)
##  R6             2.5.1   2021-08-19 [1] CRAN (R 4.4.0)
##  Rcpp           1.0.13  2024-07-17 [1] CRAN (R 4.4.0)
##  readr        * 2.1.5   2024-01-10 [1] CRAN (R 4.4.0)
##  rlang          1.1.4   2024-06-04 [1] CRAN (R 4.4.0)
##  rmarkdown      2.28    2024-08-17 [1] CRAN (R 4.4.0)
##  rScudo       * 1.20.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.0)
##  rstudioapi     0.16.0  2024-03-24 [1] CRAN (R 4.4.0)
##  sass           0.4.9   2024-03-15 [1] CRAN (R 4.4.0)
##  scales         1.3.0   2023-11-28 [1] CRAN (R 4.4.0)
##  sessioninfo  * 1.2.2   2021-12-06 [1] CRAN (R 4.4.0)
##  statmod        1.5.0   2023-01-06 [1] CRAN (R 4.4.0)
##  stringi        1.8.4   2024-05-06 [1] CRAN (R 4.4.0)
##  stringr      * 1.5.1   2023-11-14 [1] CRAN (R 4.4.0)
##  tibble       * 3.2.1   2023-03-20 [1] CRAN (R 4.4.0)
##  tidyr        * 1.3.1   2024-01-24 [1] CRAN (R 4.4.0)
##  tidyselect     1.2.1   2024-03-11 [1] CRAN (R 4.4.0)
##  tidyverse    * 2.0.0   2023-02-22 [1] CRAN (R 4.4.0)
##  timechange     0.3.0   2024-01-18 [1] CRAN (R 4.4.0)
##  tzdb           0.4.0   2023-05-12 [1] CRAN (R 4.4.0)
##  utf8           1.2.4   2023-10-22 [1] CRAN (R 4.4.0)
##  vctrs          0.6.5   2023-12-01 [1] CRAN (R 4.4.0)
##  viridisLite    0.4.2   2023-05-02 [1] CRAN (R 4.4.0)
##  withr          3.0.1   2024-07-31 [1] CRAN (R 4.4.0)
##  xfun           0.47    2024-08-17 [1] CRAN (R 4.4.0)
##  xml2           1.3.6   2023-12-04 [1] CRAN (R 4.4.0)
##  yaml           2.3.10  2024-07-26 [1] CRAN (R 4.4.0)
## 
##  [1] /opt/homebrew/lib/R/4.4/site-library
##  [2] /opt/homebrew/Cellar/r/4.4.0/lib/R/library
## 
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

Data selection

The analysis will use the dataset GSE20437 obtained from GEO.The dataset is generated from Affymetrix HU133A microarrays and contains 42 tissue samples.

In detail, the data includes:

Note that sample numbers correspond to individual patient samples.

# download the GSE20437 expression data series
gse <- getGEO("GSE20437", destdir= './data/', getGPL = F)
# getGEO returns a list of expression objects, but...
length(gse) 
## [1] 1
# shows us there is only one object in it. 
# We assign it to the same variable.
gse <- gse[[1]]

Exploratory analysis

# show what we have:
show(gse)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 22283 features, 42 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM512539 GSM512540 ... GSM512580 (42 total)
##   varLabels: title geo_accession ... tissue:ch1 (38 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 20197764 
## Annotation: GPL96

The actual expression data are accessible in the exprs section of gse, an Expression Set and the generic data class that BioConductor uses for expression data.

head(exprs(gse)) 
##           GSM512539 GSM512540 GSM512541 GSM512542 GSM512543 GSM512544 GSM512545 GSM512546 GSM512547 GSM512548 GSM512549
## 1007_s_at    2461.4    3435.7    1932.5    2377.7    3055.3    2978.1    2348.5    2963.9    2776.9    3088.9    3033.3
## 1053_at        26.7     159.0      31.2     140.7      69.9      98.5      37.0      59.9      86.7     107.2      64.0
## 117_at         82.6     243.4     150.2      95.1     209.3     103.4      91.2     168.4     162.7     203.2     143.7
## 121_at        942.3     897.5     840.8     870.9     685.4     791.8     886.5     954.2     843.1     775.3     847.6
## 1255_g_at      71.8      87.9      75.4      58.1      31.8      40.3      70.5      43.3      51.6      42.6      74.9
## 1294_at       630.2     571.4     346.3     679.9    1289.3     421.1     417.6     811.6     778.1     393.2     995.4
##           GSM512550 GSM512551 GSM512552 GSM512553 GSM512554 GSM512555 GSM512556 GSM512557 GSM512558 GSM512559 GSM512560
## 1007_s_at    3037.1    3545.8    3322.6    1963.7    3609.6    2078.9    4138.6    4260.7    2453.6    2709.0    2612.5
## 1053_at        82.9      97.7      69.7      82.0      45.6      84.5      31.7      37.4      82.4     204.8     119.3
## 117_at        113.5      80.0     186.4     106.6     145.6     144.4     133.6     278.6     173.0     147.8     186.0
## 121_at        912.2     911.6     862.4     705.0     984.6     853.8     846.8    1273.0     833.6     908.1     806.2
## 1255_g_at      53.7      30.5      15.2      42.5      76.6      88.2      90.6      65.8      25.8      77.5      84.3
## 1294_at       987.7     938.5     924.6     480.8    1054.1     632.0     448.0    1345.2    1248.9     405.7     647.5
##           GSM512561 GSM512562 GSM512563 GSM512564 GSM512565 GSM512566 GSM512567 GSM512568 GSM512569 GSM512570 GSM512571
## 1007_s_at    4340.1    3155.3    2390.3    2738.8    3233.1    2836.6    2915.4    3457.5    2798.7    4370.2    2467.3
## 1053_at        76.7     100.3     115.4      14.1      47.6      77.1      47.1      47.0      83.2      40.2      80.3
## 117_at        168.0      95.2      73.6     122.7     107.6     120.9     143.4      92.5      72.1     131.8     156.4
## 121_at        827.0     629.4     709.2     305.6     877.4     425.7     643.8     771.3     681.1     812.7     533.4
## 1255_g_at      87.9      44.6      59.3      12.0      82.1      59.2      62.2      28.3      97.6       8.1      17.9
## 1294_at      2218.1    1321.1     606.7    1709.9     980.8    1268.4     955.8    1157.5     888.6    1130.8     905.1
##           GSM512572 GSM512573 GSM512574 GSM512575 GSM512576 GSM512577 GSM512578 GSM512579 GSM512580
## 1007_s_at    3669.5    3310.1    3942.2    4520.4    3596.1    2989.0    3164.5    2764.3    4258.5
## 1053_at        24.1       8.8      44.6      54.7      56.7      89.9      63.4      57.0      59.5
## 117_at        165.8     141.6      97.1     132.7     124.3     210.5     131.4      89.6     123.3
## 121_at        746.9    1090.3    1008.7     718.6     988.4     295.9     957.3     630.8     869.2
## 1255_g_at      53.0      39.9      11.0      50.2      60.0      34.3      33.5      61.7      50.4
## 1294_at      1138.5     483.0    1326.5    1179.4     668.3     863.2    1055.5    1287.6    1127.8

To conveniently access the data rows and columns present in exprs(gse), this matrix is assigned to its own variable ex.

# exprs (gse) is a matrix that we can assign to its own variable, to
# conveniently access the data rows and columns
ex <- exprs(gse)
dim(ex) # 42 sample, 22283 genes
## [1] 22283    42

The dataset contains gene expression data of 22283 genes (rows) from 42 patients (columns).

Pre-processing

# Analyze value distributions
#boxplot(ex)
boxplot(ex, main = 'Boxplot of the data before normalization')

The boxplot shows that scaling is necessary. So, in this case, I try to apply a log transformation to the data.

ex2<-log(ex)
ex2 <- na.omit(as.matrix(ex2))
boxplot(ex2, main = 'Boxplot of the data after applying a logarithmic transformation')
Boxplot of the data after applying a logarithmic transformation

Boxplot of the data after applying a logarithmic transformation

From the boxplot after the log transformation, I can see that there is some variation in the median of the samples. So, I try to apply a median normalization to the data after the log transformation.

# MEDIAN NORMALIZATION
channel.medians=apply(log(ex),2,median)
normalized.log.ex=sweep(log(ex),2,channel.medians,"-")

# boxplot post median normalization on ex
boxplot(normalized.log.ex, main = 'Boxplot of the data after median normalization')
Boxplot of the data after median normalization

Boxplot of the data after median normalization

PCA

PCA is a dimensionality reduction technique that allows to condense thousands of dimensions into just two or three. For the dataset’s samples, the PCA scores display the coordinates in relation to these additional dimensions.

pca <- prcomp(t(normalized.log.ex))

#summary(pca)
#screeplot(pca)

To get the summary of the PCA and the plot showing the variance explained by the first 10 components, it is possible to use the functions commented in the chunks above.

However, using ggplot2 and factoextra packages is possible to get a more concise and informative plot reporting the same information.

pcaVar <- get_eig(pca)
pcaVar <- pcaVar$variance.percent[1:10]
screeDf <- data.frame("Dimensions" = as.factor(seq(1,10)),
                      "Percentages" = pcaVar,
                      "Labels" = paste(round(pcaVar, 2), "%"))

p <- ggplot(data = screeDf, aes(x=Dimensions, y=Percentages))+
  geom_bar(stat = "identity", fill = "#d1105a")+
  geom_text(aes(label=Labels), vjust=-0.5, color="black", size=3.6)+
  ggtitle("Scree Plot")+
  ylab("Percentage of variance explained")+
  scale_x_discrete(labels = as.factor(seq(1,10)))
p

# draw PCA plot
group <- c(rep("cadetblue1",18), rep("red",18), rep("purple",6) ) # vector of colors based on the order of my data
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1 and 2", type="p", pch=10, col=group)
text(pca$x[,1], pca$x[,2], rownames(pca$data), cex=0.75)

The vector group used in the PCA plot is based on the data. The samples corresponding to the colors are the following:

  • Light blue: reduction mammoplasty (RM) breast epithelium samples

  • Red: histologically normal (HN) epithelial samples from breast cancer patient

  • Purple: histologically normal breast epithelium (NlEpi) from prophylactic mastectomy patient samples

Let’s try to explore an interactive PCA plot.

components<-pca[["x"]]
components<-data.frame(components)
type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
components<-cbind(components, type )

fig <- plot_ly(components, x=~PC1, y=~PC2, 
               color=type,colors=c('cadetblue1', 'red','purple'), 
               type='scatter',mode='markers')
fig
fig2 <- plot_ly(components, x=~PC1, y=~PC2, z=~PC3, 
                color=type, colors=c('cadetblue1', 'red','purple'),
                mode='markers', marker = list(size = 4))
fig2
fig3 <- plot_ly(components, x=~PC1, y=~PC3, 
                color=type, colors=c('cadetblue1', 'red','purple'),
                type='scatter',mode='markers')
fig3

UMAP

umap_data <- umap::umap(t(normalized.log.ex), 
                        n_neighbors = sqrt(dim(t(normalized.log.ex))[1]),#or the square root of the rows 
                        min_dist = 0.1,         
                        metric = "euclidean",    #you can change it
                        n_components = 2) #used the default ones! 

umap_df <- data.frame(umap_data$layout) %>% tibble::rownames_to_column('Samples')
umap_df$type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))

library(plotly)

figUmap <- plot_ly(umap_df, 
                 x = ~X1, y = ~X2, color = umap_df$type,
                 colors = c('cadetblue1', 'red','purple'),   
                 mode = 'markers',
                 size=1)

# Display the 2D scatter plot
figUmap
umap_data_3D <- umap::umap(t(normalized.log.ex), 
                        n_neighbors = sqrt(dim(t(normalized.log.ex))[1]),
                      #or the square root of the rows, provato anche con 5 fa pena, anche 15 abbastanza pena  
                        min_dist = 0.1,         
                        metric = "euclidean",    #you can change it
                        n_components = 3) #used the default ones! 

umap_df_3D <- data.frame(umap_data_3D$layout) %>% tibble::rownames_to_column('Samples')
umap_df_3D$type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))

figUmap_3D <- plot_ly(umap_df_3D, 
                 x = ~X1, y = ~X2, z=~X3, color = umap_df_3D$type,
                 colors = c('cadetblue1', 'red','purple'),   
                 mode = 'markers',
                 size=1) %>% layout(title = 'Umap 1 control-tumors 3D')

# Display the 2D scatter plot
figUmap_3D

```